Studies

# Count all trials (one row per NCT_ID is assumed -- TODO confirm) before any
# filtering, so the interventional share can be reported below.
n_studies_total <- nrow(studies)
sprintf("Total trials (NCT_IDs): %d", n_studies_total)
## [1] "Total trials (NCT_IDs): 300214"
# Keep interventional trials only. `%in%` is used instead of `==` so that a
# missing study_type evaluates to FALSE rather than NA; an NA in a logical row
# index would add an all-NA row and inflate the count reported below.
studies <- studies[studies$study_type %in% "Interventional", ]
# The column is constant after the filter, so drop it.
studies$study_type <- NULL
n_studies_itv <- nrow(studies)
sprintf(
  "Interventional trials: %d (%.1f%%)",
  n_studies_itv, 100 * n_studies_itv / n_studies_total
)
## [1] "Interventional trials: 237892 (79.2%)"
# Recode the literal "N/A" phase label as a true missing value so that it is
# tallied under NA by table(..., useNA = "ifany").
studies$phase[studies$phase %in% "N/A"] <- NA
"===All studies, phase:"
## [1] "===All studies, phase:"
tbl <- table(studies$phase, useNA = "ifany")
#sprintf("%18s: %6d", names(tbl), tbl)
knitr::kable(data.frame(tbl), caption = "All studies, by phase")
All studies, by phase
Var1 Freq
Early Phase 1 2619
Phase 1 29795
Phase 1/Phase 2 10063
Phase 2 41637
Phase 2/Phase 3 4963
Phase 3 29662
Phase 4 25001
NA 94152
### Drugs (id, nct_id, name)
“id” is AACT_ID
# Attach drug interventions to the trial table. This is a full outer join on
# nct_id: a trial gets one row per matching drug row, and unmatched rows from
# either side are kept with NA in the other side's columns.
studies <- merge(
  studies,
  dplyr::rename(drugs, drug_name = name, drug_itv_id = id),
  by = "nct_id", all = TRUE
)
# A row is flagged as a drug trial when a drug intervention matched it.
studies[["is_drug_trial"]] <- !is.na(studies$drug_itv_id)
# Left join: pull the trial-level columns onto every drug row.
# NOTE(review): `studies` already holds one row per (trial, drug) pair at this
# point, so joining on nct_id alone pairs each drug row with every row of its
# trial. The phase tallies below count rows of that join, not distinct
# interventions -- confirm this is intended.
drugs <- merge(drugs, studies, by = "nct_id", all.x = TRUE, all.y = FALSE)
drugs <- drugs[order(drugs$name), ]
"===All drugs, phase:"
## [1] "===All drugs, phase:"
tbl <- table(drugs$phase, useNA = "ifany")
sprintf("%18s: %6d", names(tbl), tbl)
## [1] "     Early Phase 1:   5947" "           Phase 1: 154025"
## [3] "   Phase 1/Phase 2:  44284" "           Phase 2: 212320"
## [5] "   Phase 2/Phase 3:  17815" "           Phase 3: 161245"
## [7] "           Phase 4:  92565" "                NA:  73388"
sprintf("Drug trials (NCT_IDs): %d", length(unique(drugs$nct_id)))
## [1] "Drug trials (NCT_IDs): 129628"
sprintf("Unique drug names: %d", length(unique(drugs$name)))
## [1] "Unique drug names: 91347"

NextMove Leadmine NER

# Give the NER output the column names used in the rest of the analysis:
# DocName becomes the join key `id`, ResolvedForm becomes `smiles`.
drugs_leadmine <- dplyr::rename(
  drugs_leadmine,
  id = DocName,
  smiles = ResolvedForm
)
#
# Inner join on `id`: only drug rows with a matching NER row are kept.
drugs <- merge(x = drugs, y = drugs_leadmine, by = "id")
# TRUE where the NER row carries a non-missing SMILES value.
drugs$resolved_structure <- !is.na(drugs[["smiles"]])
"===Drugs, resolved structure:"
## [1] "===Drugs, resolved structure:"
tbl <- table(drugs[["resolved_structure"]])
sprintf("%18s: %6d", names(tbl), tbl)
## [1] "             FALSE:  55047" "              TRUE: 543513"
"===Drugs, overall_status:"
## [1] "===Drugs, overall_status:"
tbl <- table(drugs[["overall_status"]])
sprintf("%18s: %6d", names(tbl), tbl)
## [1] "Active, not recruiting:  36946"  "         Completed: 337892"     
## [3] "Enrolling by invitation:   1859" "Not yet recruiting:  16986"     
## [5] "        Recruiting:  79304"      "         Suspended:   2787"     
## [7] "        Terminated:  52527"      "    Unknown status:  41516"     
## [9] "         Withdrawn:  15141"

Studies by year

## Warning: Ignoring 1 observations

### Studies by classification
### Aggregate mentions by intervention ID.

# One row per intervention ID that has at least one mention with a non-missing
# SMILES; `n` is the number of such mention rows for that ID.
ner <- drugs_leadmine %>%
  dplyr::filter(!is.na(smiles)) %>%
  group_by(id) %>%
  summarise(n = n())
# Share of the intervention IDs present in `drugs` that appear in `ner`.
sprintf(
  "Mentions by intervention ID: %.1f%% (%d/%d)",
  100 * nrow(ner) / length(unique(drugs$id)),
  nrow(ner),
  length(unique(drugs$id))
)
## [1] "Mentions by intervention ID: 91.9% (157862/171741)"

Aggregate mentions by trial.

# Attach the trial ID to every NER mention. The lookup is de-duplicated first:
# `drugs` can hold the same (drug_itv_id, nct_id) pair on many rows, and
# merging against the raw columns would multiply the mention rows accordingly.
# The number of groups reported below is unaffected by the de-duplication.
drugs_leadmine <- merge(
  drugs_leadmine,
  unique(drugs[, c("drug_itv_id", "nct_id")]),
  by.x = "id", by.y = "drug_itv_id"
)
# One row per trial that has at least one mention with a non-missing SMILES.
ner <- drugs_leadmine[!is.na(drugs_leadmine$smiles), ] %>%
  group_by(nct_id) %>%
  summarise(n = n())
sprintf(
  "Mentions by study: %.1f%% (%d/%d)",
  100 * nrow(ner) / length(unique(drugs$nct_id)),
  nrow(ner), length(unique(drugs$nct_id))
)
## [1] "Mentions by study: 93.3% (92966/99647)"

Aggregate mentions by drug.

# One row per distinct OriginalText value among mentions with a non-missing
# SMILES; `n` is the number of such mention rows for that text.
ner <- drugs_leadmine %>%
  dplyr::filter(!is.na(smiles)) %>%
  group_by(OriginalText) %>%
  summarise(n = n())
# Compared against the number of distinct drug names in `drugs`.
sprintf(
  "Mentions by drug name: %.1f%% (%d/%d)",
  100 * nrow(ner) / length(unique(drugs$name)),
  nrow(ner),
  length(unique(drugs$name))
)
## [1] "Mentions by drug name: 19.1% (11108/58297)"

PUBCHEM:

Intervention IDs to CIDs from PubChem (via SMILES)

# Keep only rows with a usable CID: drop missing values and 0
# (presumably a "no match" placeholder -- TODO confirm).
drug2cid <- drug2cid[!is.na(drug2cid$cid) & drug2cid$cid != 0, ]
# Inner join on SMILES to recover the intervention ID(s) behind each structure.
# The (smiles, id) lookup is de-duplicated so each pairing is joined once.
drug2cid <- merge(
  drug2cid, unique(drugs[, c("smiles", "id")]),
  by = "smiles", all = FALSE
)
drug2cid <- dplyr::rename(drug2cid, itv_id = "id")
# Drop the columns that are not part of the exported mapping, then collapse
# the duplicate rows that removing them leaves behind.
drug2cid$smiles <- NULL
drug2cid$names <- NULL
drug2cid <- unique(drug2cid)
sprintf("Intervention IDs mapped to PubChem CIDs (via SMILES): %d", nrow(drug2cid))
## [1] "Intervention IDs mapped to PubChem CIDs (via SMILES): 153876"
# Persist the mapping as a tab-separated file.
write_delim(drug2cid, "../data/aact_drugs_itvid2cid.tsv", delim = "\t")

InChIKeys from PubChem (via CIDs)

# Report the number of rows in `pubchem`
# (presumably one row per CID -- TODO confirm).
sprintf(
  "PubChem CIDs with InChIKeys: %d",
  nrow(pubchem)
)
## [1] "PubChem CIDs with InChIKeys: 3801"

CHEMBL:

ChEMBL molecule IDs, and properties (via InChIKeys)

ChEMBL activities (via compounds)

ChEMBL target IDs (via activities)

IDG/TCRD:

# Load the TCRD/Pharos target table (tab-separated).
# NOTE(review): the column specification printed below shows grantCount,
# r01Count, grantTotalCost and pubmedCount guessed as logical. If those columns
# hold counts, the guess is wrong -- confirm, and pass `col_types` if so.
tcrd_tgt <- read_delim("~/src/TCRD_tools/data/pharos_targets.tsv", delim = "\t")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   idgFamily = col_character(),
##   accession = col_character(),
##   self = col_character(),
##   grantCount = col_logical(),
##   description = col_character(),
##   kind = col_character(),
##   name = col_character(),
##   r01Count = col_logical(),
##   deprecated = col_logical(),
##   grantTotalCost = col_logical(),
##   idgTDL = col_character(),
##   pubmedCount = col_logical(),
##   gene = col_character()
## )
## See spec(...) for full column specifications.
# Left join on accession: every ChEMBL target is kept, and targets without a
# TCRD match get NA in the TCRD columns (including idgTDL).
tgt <- merge(chembl_tgt, tcrd_tgt, by = "accession", all.x = TRUE, all.y = FALSE)
# Rows with a non-missing idgTDL are the ones that matched a TCRD record.
sprintf("ChEMBL target proteins mapped to TCRD (human): %d",
    sum(!is.na(tgt$idgTDL)))
## [1] "ChEMBL target proteins mapped to TCRD (human): 1806"
# Convert `tgt` to a data.table by reference so the grouped counts below can
# use data.table's `[i, j, by]` syntax.
setDT(tgt)
sprintf("Organisms: %d", length(unique(tgt$organism)))
## [1] "Organisms: 187"
"===Targets by organism (top 10):"
## [1] "===Targets by organism (top 10):"
# Row count per organism, largest first. head() returns at most 10 rows;
# indexing with 1:10 would pad the result with NA rows whenever fewer than 10
# organisms are present.
org_counts <- head(tgt[, .(.N), by = "organism"][order(-N)], 10L)
sprintf("%28s: %6d", org_counts$organism, org_counts$N)
##  [1] "                Homo sapiens:   1806"
##  [2] "           Rattus norvegicus:    529"
##  [3] "                Mus musculus:    238"
##  [4] "                  Bos taurus:     98"
##  [5] "                  Sus scrofa:     36"
##  [6] "             Cavia porcellus:     26"
##  [7] "       Escherichia coli K-12:     19"
##  [8] "       Oryctolagus cuniculus:     18"
##  [9] "            Escherichia coli:     17"
## [10] "  Mycobacterium tuberculosis:     17"
"===Targets, TDL for human:"
## [1] "===Targets, TDL for human:"
# Row count per idgTDL value, restricted to human targets.
tdl_counts <- tgt[organism == "Homo sapiens", .(.N), by = "idgTDL"]
sprintf("%8s: %6d", tdl_counts$idgTDL, tdl_counts$N)
## [1] "    Tbio:    224" "   Tchem:    868" "   Tdark:      7"
## [4] "   Tclin:    707"